In [1]:
# Import the spreadsheet data
import pandas as pd
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where the workbook exists at exactly this location.
df=pd.read_excel('C:\\Users\\hp\\Desktop\\keshe\\noonehot.xlsx')
In [2]:
# Rewrite the plain-ASCII product formulas with Unicode subscripts.
# Only whole-cell matches are replaced; values not listed here are left as-is.
subscript_formulas = {
    'C2H4': 'C₂H₄',
    'C2H5OH': 'C₂H₅OH',
    'CH3COOH': 'CH₃COOH',
    'CH3OH': 'CH₃OH',
    'CH4': 'CH₄',
    'H2C2O4': 'H₂C₂O₄',
}
df['first_product'] = df['first_product'].replace(subscript_formulas)
In [3]:
# Column J
In [4]:
# Give the feature columns human-readable names (these names are what the
# later cells select and what appears on the plots).
readable_column_names = {
    'voltage': 'Voltage',
    'electrolyte_flow_anolyte_rate': 'Electrolyte flow rate',
    'CO2_flow_rate': 'CO₂ flow rate',
    'MEA_area': 'MEA area',
    'membrane_area': 'Membrane area',
    'anode_area': 'Anode area',
    'anode_catalyst_loading': 'Anode catalyst loading',
    'cathode_area': 'Cathode area',
    'cathode_catalyst_loading': 'Cathode catalyst loading',
    'temperature': 'Temperature',
    'electrolyte_1': 'Electrolyte concentration',
    'copper_containing': 'Copper containing',
}
df.rename(columns=readable_column_names, inplace=True)
In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import shap

embedding_columns = ['Material']
bool_column = ['Copper containing']

# Remaining feature columns, listed once so the selection below stays readable.
other_feature_columns = [
    'Voltage', 'Electrolyte flow rate', 'MEA configuration',
    'Electrolyte', 'Membrane', 'Anode', 'CO₂ flow rate',
    'MEA area', 'Membrane area', 'Anode area',
    'Anode catalyst loading', 'Cathode area',
    'Cathode catalyst loading', 'Temperature', 'Electrolyte concentration',
]

# Feature matrix and classification target
X = df[embedding_columns + bool_column + other_feature_columns]
y = df['first_product']

# Encode the target labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Map each integer code back to the label it stands for
class_codes = label_encoder.transform(label_encoder.classes_)
encoded_to_original = {
    code: label for code, label in zip(class_codes, label_encoder.classes_)
}

# Print the code -> label correspondence
for code, label in encoded_to_original.items():
    print(f"编码值: {code} 对应原始值: {label}")

# Hold out 25% of the rows as a test set; the rest is used for cross-validation
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y_encoded, train_size=0.75, test_size=0.25, random_state=42
)
编码值: 0 对应原始值: CH₃COOH
编码值: 1 对应原始值: CH₃OH
编码值: 2 对应原始值: CH₄
编码值: 3 对应原始值: CO
编码值: 4 对应原始值: C₂H₄
编码值: 5 对应原始值: C₂H₅OH
编码值: 6 对应原始值: HCOOH
编码值: 7 对应原始值: H₂C₂O₄
In [6]:
# K-fold cross-validation on the training/validation portion
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, test_index in kf.split(X_trainval):
    X_train, X_test_fold = X_trainval.iloc[train_index], X_trainval.iloc[test_index]
    y_train, y_test_fold = y_trainval[train_index], y_trainval[test_index]

    # Fresh random forest for every fold
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Fit on this fold's training part
    rf_classifier.fit(X_train, y_train)

    # Predict this fold's validation part
    y_pred_fold = rf_classifier.predict(X_test_fold)

    # Fold metrics.
    # NOTE(review): zero_division=1 scores an undefined per-class
    # precision/recall/F1 as 1.0, which can inflate the weighted averages
    # when a class is missing from a fold — confirm this is intended.
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted',zero_division=1)
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted',zero_division=1)
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted',zero_division=1)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Report the metrics averaged over the folds
print(f"Average Accuracy on KFold: {np.mean(accuracies)}")
print(f"Average Precision on KFold: {np.mean(precisions)}")
print(f"Average Recall on KFold: {np.mean(recalls)}")
print(f"Average F1 Score on KFold: {np.mean(f1_scores)}")

# Evaluate on the held-out test set.
# The classifier left over from the loop has only seen 4/5 of the training
# data (whichever fold came last), so refit on the full training/validation
# portion before scoring the test set.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_trainval, y_trainval)
y_pred_test = rf_classifier.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Accuracy on held-out test set: {accuracy_test}")

# Pass the full list of class codes so the matrix always has one row/column
# per class, even if a class is absent from the test split; otherwise the
# tick labels below would be shifted against the matrix cells.
class_names = label_encoder.classes_
n_classes = len(class_names)
conf_matrix = confusion_matrix(y_test, y_pred_test, labels=np.arange(n_classes))

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion matrix with semantic information')
plt.xlabel('Predicted')
# Keep the y-axis tick labels horizontal
plt.yticks(rotation=0)
plt.ylabel('Actual')
plt.savefig('JCMyuyi.jpg', bbox_inches = 'tight',dpi=1000)
plt.show()

# SHAP analysis: explain a forest fitted on the complete data set
shap_feat = X.values
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(shap_feat, y_encoded)
explainer = shap.TreeExplainer(rf_classifier)
shap_values = explainer.shap_values(shap_feat)

# Newer SHAP releases return one (samples, features, classes) array for
# multi-class models, older ones return a list with one 2-D array per class.
# Normalise to the per-class list that the plots below consume.
if isinstance(shap_values, list):
    list_of_2d_arrays = list(shap_values)
else:
    list_of_2d_arrays = [shap_values[:, :, i] for i in range(n_classes)]

# Overall summary plot across all classes
plt.figure()
shap.summary_plot(list_of_2d_arrays, shap_feat, feature_names=X.columns, class_names=class_names,show=False)
plt.xlabel('SHAP value')
plt.savefig('Jzongtissp.jpg',dpi=1000)
plt.show()

# One SHAP dot plot per class, each saved to its own image file
for i, class_shap_values in enumerate(list_of_2d_arrays):
    plt.figure()
    shap.summary_plot(class_shap_values, X, plot_type="dot", show=False)
    # Label the x-axis with the original class name for this code
    category_name = encoded_to_original[i]
    plt.xlabel(category_name)
    plt.savefig(f'Jshap{i}.jpg',dpi=1000)
    plt.show()
Average Accuracy on KFold: 0.8997402597402597
Average Precision on KFold: 0.9096607297011046
Average Recall on KFold: 0.8997402597402597
Average F1 Score on KFold: 0.8920270036592957
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [7]:
# Column Q
In [8]:
# Turn the total_current_density column into a categorical variable
def categorize_current(current):
    """Bin a current-density value into one of three category labels.

    Returns 'Category_1' for 0 < current <= 100, 'Category_2' for
    100 < current <= 180 and 'Category_3' for 180 < current <= 1000.

    NOTE(review): any value outside (0, 1000] — including NaN, for which
    every comparison is false — yields None, so such rows end up without
    a category label.
    """
    if not 0 < current <= 1000:
        return None
    if current <= 100:
        return 'Category_1'
    if current <= 180:
        return 'Category_2'
    return 'Category_3'
# Label every row with its current-density category and show the result
tcd_labels = df['total_current_density'].apply(categorize_current)
df['tcd_category'] = tcd_labels
print(df['tcd_category'])
0      Category_2
1      Category_1
2      Category_3
3      Category_3
4      Category_1
          ...    
368    Category_2
369    Category_2
370    Category_3
371    Category_3
372    Category_3
Name: tcd_category, Length: 373, dtype: object
In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import shap

embedding_columns = ['Material']
bool_column = ['Copper containing']

# Feature matrix and classification target
X = df[embedding_columns + bool_column   + ['Voltage', 'Electrolyte flow rate','MEA configuration',
                                          'Electrolyte', 'Membrane','Anode','CO₂ flow rate',
                                          'MEA area', 'Membrane area', 'Anode area',
                                          'Anode catalyst loading', 'Cathode area',
                                          'Cathode catalyst loading', 'Temperature', 'Electrolyte concentration']]
y = df['tcd_category']

# Encode the target labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Map each integer code back to the label it stands for
encoded_to_original = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

# Print the code -> label correspondence
for encoded_value, original_value in encoded_to_original.items():
    print(f"编码值: {encoded_value} 对应原始值: {original_value}")
# Hold out 25% of the rows as a test set; the rest is used for cross-validation
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y_encoded, train_size=0.75, test_size=0.25, random_state=42)
# K-fold cross-validation on the training/validation portion
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, test_index in kf.split(X_trainval):
    X_train, X_test_fold = X_trainval.iloc[train_index], X_trainval.iloc[test_index]
    y_train, y_test_fold = y_trainval[train_index], y_trainval[test_index]

    # Fresh random forest for every fold
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Fit on this fold's training part
    rf_classifier.fit(X_train, y_train)

    # Predict this fold's validation part
    y_pred_fold = rf_classifier.predict(X_test_fold)

    # Fold metrics.
    # NOTE(review): zero_division=1 scores an undefined per-class
    # precision/recall/F1 as 1.0, which can inflate the weighted averages
    # when a class is missing from a fold — confirm this is intended.
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted',zero_division=1)
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted',zero_division=1)
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted',zero_division=1)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Report the metrics averaged over the folds
print(f"Average Accuracy on KFold: {np.mean(accuracies)}")
print(f"Average Precision on KFold: {np.mean(precisions)}")
print(f"Average Recall on KFold: {np.mean(recalls)}")
print(f"Average F1 Score on KFold: {np.mean(f1_scores)}")

# Evaluate on the held-out test set.
# The classifier left over from the loop has only seen 4/5 of the training
# data (whichever fold came last), so refit on the full training/validation
# portion before scoring the test set.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_trainval, y_trainval)
y_pred_test = rf_classifier.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f"Accuracy on held-out test set: {accuracy_test}")

# Pass the full list of class codes so the matrix always has one row/column
# per class, even if a class is absent from the test split; otherwise the
# tick labels below would be shifted against the matrix cells.
class_names = label_encoder.classes_
n_classes = len(class_names)
conf_matrix = confusion_matrix(y_test, y_pred_test, labels=np.arange(n_classes))


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion matrix with semantic information')
# Keep the y-axis tick labels horizontal
plt.yticks(rotation=0)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('QCMyuyi.jpg', dpi=1000,bbox_inches = 'tight')
plt.show()

# SHAP analysis: explain a forest fitted on the complete data set
shap_feat = X.values
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(shap_feat, y_encoded)
explainer = shap.TreeExplainer(rf_classifier)
shap_values = explainer.shap_values(shap_feat)

# Newer SHAP releases return one (samples, features, classes) array for
# multi-class models, older ones return a list with one 2-D array per class.
# Normalise to the per-class list that the plots below consume.
if isinstance(shap_values, list):
    list_of_2d_arrays = list(shap_values)
else:
    list_of_2d_arrays = [shap_values[:, :, i] for i in range(n_classes)]
# Overall SHAP summary plot across all classes, saved as an image
plt.figure()
shap.summary_plot(list_of_2d_arrays, shap_feat, feature_names=X.columns, class_names=class_names, show=False)
plt.xlabel('SHAP value')
plt.savefig('Qzongtissp.jpg', dpi=1000)
plt.show()

# One SHAP dot plot per class, each saved to its own image file
for i, class_shap_values in enumerate(list_of_2d_arrays):
    plt.figure()
    shap.summary_plot(class_shap_values, X, plot_type="dot", show=False)
    # Label the x-axis with the original class name for this code
    category_name = encoded_to_original[i]
    plt.xlabel(category_name)
    plt.savefig(f'Qshap{i}.jpg', dpi=1000)
    plt.show()
编码值: 0 对应原始值: Category_1
编码值: 1 对应原始值: Category_2
编码值: 2 对应原始值: Category_3
Average Accuracy on KFold: 0.6848051948051948
Average Precision on KFold: 0.7039113248430585
Average Recall on KFold: 0.6848051948051948
Average F1 Score on KFold: 0.6848679489989185
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [10]:
# Column K
In [11]:
# Rename the efficiency column and divide it by 100 (presumably converting
# percent to a fraction — confirm against the source data).
# The guard on the raw column name makes this cell idempotent: once the column
# has been renamed, re-running the cell no longer divides the values again.
if 'first_product_faraday_efficiency' in df.columns:
    df.rename(columns={'first_product_faraday_efficiency': 'First product faraday efficiency'}, inplace=True)
    df['First product faraday efficiency'] = df['First product faraday efficiency'] / 100
print(df['First product faraday efficiency'])
0      0.9300
1      0.9100
2      0.3400
3      0.9330
4      0.8200
        ...  
368    0.4482
369    0.9220
370    0.9864
371    0.9000
372    0.4390
Name: First product faraday efficiency, Length: 373, dtype: float64
In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

embedding_columns = ['Material']
bool_column = ['Copper containing']

# Assumes the DataFrame `df` has already been prepared by the earlier cells.
# Label-encode first_product so it can be used as an input feature here.
le = LabelEncoder()
df['First product'] = le.fit_transform(df['first_product'])

# Feature matrix and regression target
X = df[embedding_columns + bool_column + ['Voltage', 'Electrolyte flow rate', 'MEA configuration',
                                          'Electrolyte', 'Membrane', 'Anode', 'CO₂ flow rate',
                                          'MEA area', 'Membrane area', 'Anode area',
                                          'Anode catalyst loading', 'Cathode area',
                                          'Cathode catalyst loading', 'Temperature', 'Electrolyte concentration', 'First product']]
y = df['First product faraday efficiency']

# Hold out 25% of the rows as a test set
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=42)

# Set up K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics and pooled out-of-fold predictions for the random forest
rf_mse_scores = []
rf_r2_scores = []
all_true_values = []
all_predicted_values = []

# Random forest regressor (refitted from scratch on every fold by .fit)
rf_model = RandomForestRegressor(random_state=42)

for train_index, val_index in kf.split(X_trainval):
    X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[val_index]
    y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[val_index]
    rf_model.fit(X_train, y_train)
    y_val_pred = rf_model.predict(X_val)
    rf_mse_scores.append(mean_squared_error(y_val, y_val_pred))
    rf_r2_scores.append(r2_score(y_val, y_val_pred))
    all_true_values.extend(y_val)
    all_predicted_values.extend(y_val_pred)

# Average MSE and R² over the folds
average_mse = np.mean(rf_mse_scores)
average_r2 = np.mean(rf_r2_scores)

print(f"Random Forest - Average Mean Squared Error: {average_mse}")
print(f"Random Forest - Average Coefficient of determination: {average_r2}")

# Least-squares line through the (true, predicted) pairs.
# Convert the pooled lists to column vectors for scikit-learn.
all_true_values = np.array(all_true_values).reshape(-1, 1)
all_predicted_values = np.array(all_predicted_values).reshape(-1, 1)

# Fit predicted ~ true with ordinary linear regression
reg = linear_model.LinearRegression()
reg.fit(all_true_values, all_predicted_values)

# Slope and intercept of the fitted line
slope = reg.coef_[0][0]
intercept = reg.intercept_[0]

# Scatter plot of predicted vs. true values
plt.scatter(all_true_values, all_predicted_values)

# Draw the fitted line between the smallest and largest true value.
# Use the array reductions rather than Python's min()/max(): the builtins
# iterate over the rows of the (n, 1) array and compare one-element arrays,
# producing a nested (2, 1) result instead of two plain scalars.
x_vals = np.array([all_true_values.min(), all_true_values.max()])
y_vals = slope * x_vals + intercept
plt.plot(x_vals, y_vals, color='red')

plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Scatter plot with semantic information')

plt.show()
# Refit the final model on the whole training/validation portion
final_rf_model = RandomForestRegressor(random_state=42)
final_rf_model.fit(X_trainval, y_trainval)

# Build a SHAP explainer for the final model
explainer = shap.TreeExplainer(final_rf_model)
# SHAP values for the held-out test rows
shap_values = explainer.shap_values(X_test)
# SHAP dot plot (displayed only; this cell does not write it to a file)
plt.figure()
shap.summary_plot(shap_values, X_test, plot_type="dot", show=False)

plt.show()
Random Forest - Average Mean Squared Error: 0.026129074699390454
Random Forest - Average Coefficient of determination: 0.5671335390191871
No description has been provided for this image
No description has been provided for this image
In [ ]: